Libraries

library(readr)
library(dplyr)
library(highcharter)
library(ggplot2)
library(ngram)
library(tidyr)
setwd("/Users/Apple/Documents/TaraFiles/University/term 8/Data Analysis/week_10/hw_10/")

1.

Ten poorest countries are found by their GDP per capita (current US$)“. The percentages of poor people are found by”Poverty headcount ratio at 3.20 a day (2011 PPP) (% of population)“.

WDICountry_Series=read_csv("./WDI_csv/WDICountry-Series.csv")
WDICountry=read_csv("./WDI_csv/WDICountry.csv")
WDIData=read_csv("./WDI_csv/WDIData.csv")
WDIFootNote=read_csv("./WDI_csv/WDIFootNote.csv")
WDISeries_Time=read_csv("./WDI_csv/WDISeries-Time.csv")
WDISeries=read_csv("./WDI_csv/WDISeries.csv")


WDIData$`Indicator Name`%>%unique()%>%as.data.frame()->a

WDIData%>%filter(`Indicator Name`=="GDP per capita (current US$)" |
                   `Indicator Name`=="Poverty headcount ratio at $3.20 a day (2011 PPP) (% of population)" |
                   `Indicator Name`=="Poverty headcount ratio at $3.20 a day (2011 PPP) (% of population)") %>%
  select(`Country Name`,`Country Code`,`2011`,`Indicator Name`) -> WDIData_q1



WDIData_q1%>%filter(`Indicator Name`=="GDP per capita (current US$)")%>%arrange(`2011`)%>%head(n=10L)
## # A tibble: 10 x 4
##    `Country Name`           `Country Code` `2011` `Indicator Name`        
##    <chr>                    <chr>           <dbl> <chr>                   
##  1 Burundi                  BDI              260. GDP per capita (current…
##  2 Ethiopia                 ETH              355. GDP per capita (current…
##  3 Congo, Dem. Rep.         COD              357. GDP per capita (current…
##  4 Niger                    NER              376. GDP per capita (current…
##  5 Liberia                  LBR              380. GDP per capita (current…
##  6 Sierra Leone             SLE              445. GDP per capita (current…
##  7 Madagascar               MDG              455. GDP per capita (current…
##  8 Central African Republic CAF              494. GDP per capita (current…
##  9 Malawi                   MWI              512. GDP per capita (current…
## 10 Gambia, The              GMB              514. GDP per capita (current…
WDIData_q1%>%filter(`Indicator Name`=="GDP per capita (current US$)")%>%arrange(`2011`)%>%
  select(`Country Name`)%>% .[1:10,] ->poorCOUNTRYcode

WDIData%>%filter(`Country Name` %in% poorCOUNTRYcode$`Country Name`)%>%
  filter(`Indicator Name`=="Poverty headcount ratio at national poverty lines (% of population)")%>%
  select(`Country Name`,`2008`,`2010`,`2011`,`2012`,`2014`)->Q1_poverty

MEAN=c(Q1_poverty[1,6],Q1_poverty[2,2],Q1_poverty[3,5],Q1_poverty[4,3],
       Q1_poverty[5,3],Q1_poverty[6,6],Q1_poverty[7,5],Q1_poverty[8,3],
       Q1_poverty[9,4],Q1_poverty[10,4])
Q1_poverty$value=as.numeric(MEAN)
Q1_poverty=Q1_poverty%>%select(`Country Name`,value)%>%mutate(type="poor percent")

WDIData%>%filter(`Country Name` %in% poorCOUNTRYcode$`Country Name`)%>%
  filter(`Indicator Name`=="Life expectancy at birth, total (years)")%>%
  select(`Country Name`,value=`2011`)%>%mutate(type="life_expectancy")->Q1_life


WDIData%>%filter(`Country Name` %in% poorCOUNTRYcode$`Country Name`)%>%
  filter(`Indicator Name`=="Adjusted net national income per capita (current US$)")%>%
  select(`Country Name`,value=`2011`)%>%mutate(type="AVGincome")->Q1_income

rbind(Q1_income,Q1_life,Q1_poverty)->q1



p1=ggplot(q1,aes(x=`Country Name`,y=value))+geom_bar(aes(fill = type),stat = "identity",position = "dodge",linetype = "dashed",color = "red",alpha=0.5)+
  xlab("country name")+ylab("value")+ggtitle("Q1")+theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p1

***

2.

The Rwandan genocide, also known as the genocide against the Tutsi, was a genocidal mass slaughter of Tutsi in Rwanda by members of the Hutu majority government. An estimated 500,000 to 1,000,000 Rwandans were killed during the 100-day period from 7 April to mid-July 1994,constituting as many as 70% of the Tutsi population. (from wikipedia)

WDIData%>%filter(`Indicator Name`=="Life expectancy at birth, total (years)")->q2_life
q2_life=q2_life[,-c(1,3,4)]
q2_life=q2_life[,c(1,33:47)]%>%na.omit()

q2_life%>%filter(`Country Code`=="RWA")->RWA

RWA[,-1]%>%as.numeric()->RWA_pop
year=as.character(c(1991:2005))
data.frame(pop=RWA_pop,year=as.character(year))->RWA_data

rbind(q2_life%>%select(pop=`1991`)%>%mutate(year="1991"),
    q2_life%>%select(pop=`1992`)%>%mutate(year="1992"),
    q2_life%>%select(pop=`1993`)%>%mutate(year="1993"),
    q2_life%>%select(pop=`1994`)%>%mutate(year="1994"),
    q2_life%>%select(pop=`1995`)%>%mutate(year="1995"),
    q2_life%>%select(pop=`1996`)%>%mutate(year="1996"),
    q2_life%>%select(pop=`1997`)%>%mutate(year="1997"),
    q2_life%>%select(pop=`1998`)%>%mutate(year="1998"),
    q2_life%>%select(pop=`1999`)%>%mutate(year="1999"),
    q2_life%>%select(pop=`2000`)%>%mutate(year="2000"),
    q2_life%>%select(pop=`2001`)%>%mutate(year="2001"),
    q2_life%>%select(pop=`2002`)%>%mutate(year="2002"),
    q2_life%>%select(pop=`2003`)%>%mutate(year="2003"),
    q2_life%>%select(pop=`2004`)%>%mutate(year="2004"),
    q2_life%>%select(pop=`2005`)%>%mutate(year="2005"))->q2_life_box

hcboxplot(x = q2_life_box$pop, var = q2_life_box$year,outliers=F)%>%
  hc_add_series(RWA_data, "spline", hcaes(y = pop,x=year), name = "Rwanda")%>%
  hc_title(text="Rwanda Vs other countries")

3.

When countries are sorted by their life expectancy in 2011, their health expenditure per capita has a decreasing trend. The higher the health expenditure per capita, the higher is their life expectancy.

# Q3

WDIData%>%filter(`Indicator Name`=="Life expectancy at birth, total (years)")%>%
  select(`Country Name`,valueL=`2011`)->Q3_life

WDIData%>%filter(`Indicator Name`=="Current health expenditure per capita (current US$)")%>%
  select(`Country Name`,valueH=`2011`)->Q3_health
Q3_health$valueH=Q3_health$valueH/100

full_join(Q3_life,Q3_health,by="Country Name")%>%na.omit()%>%
  arrange(-valueL)%>%mutate(num=row_number())->Q3

rbind(Q3%>%select(num,value=valueH)%>%mutate(type="health expenditure in 100$"),
      Q3%>%select(num,value=valueL)%>%mutate(type="Life expectancy"))->Q3_plot


p3=ggplot(Q3_plot,aes(x=num,y=value))+geom_smooth(aes(color =type))+
  xlab("country")+ylab("value")+ggtitle("Q3")+theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p3

***

4.

Roughly speaking, there has been an increasing trend in the last 50 years. However in the last 10 years it has generally decreased.

WDIData%>%filter(`Country Name`=="Iran, Islamic Rep.")%>%
  filter(`Indicator Name`=="Household final consumption expenditure per capita (constant 2010 US$)")->ir_h

ir_h=ir_h[,-c(1:4)]

data.frame(power=as.numeric(ir_h[-c(58,59)]),year=c(1960:2016))->q4

p4=ggplot(q4,aes(x=year,y=power))+geom_line()+
  xlab("year")+ylab("Household final consumption expenditure per capita")+ggtitle("Q4")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p4

5.

a[sort(c(624,328,1546,1551,195,684,522,515,1060,573,677,71,517,511,69,616, 691 ,935)),]->index

as.data.frame(index)->index
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
  filter(`Indicator Name`%in%index$index)%>%arrange(`Indicator Name`)->iran

WDIData%>%filter(`Country Name`=="World")%>%
  filter(`Indicator Name`%in%index$index)%>%arrange(`Indicator Name`)->WLD


WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." | `Country Name`=="World")%>%
  filter(`Indicator Name`=="GDP per capita (constant 2010 US$)")->GDP

iran_GDP=data.frame(data=as.numeric(GDP[2,5:62]),year=1960:2017)
iran_GDP$data=iran_GDP$data/1000
WLD_GDP=data.frame(data=as.numeric(GDP[1,5:62]),year=1960:2017)
WLD_GDP$data=WLD_GDP$data/1000

lst1 = list()

for (i in 1:18) {
  
  data.frame(value=as.numeric(iran[i,5:62]),year=1960:2017)->x
  data.frame(value=as.numeric(WLD[i,5:62]),year=1960:2017)->x1
  hchart(x,"line", hcaes(y = value, x = year),name= concatenate("IRR ",as.character(iran[i,3])))%>%
    hc_add_series(x1, "spline", hcaes(y = value, x = year), name = concatenate("WLD ",as.character(iran[i,3])))%>%
    hc_title(text =as.character(iran[i,3]))%>%
    hc_add_theme(hc_theme_sandsignika())->hc1
  lst1[[i]] <- hc1
}

hchart(iran_GDP, "spline", hcaes(y = data, x = year), name = "IRR GDP per capita in 1000$")%>%
  hc_add_series(WLD_GDP, "spline", hcaes(y = data, x = year), name = "WLD GDP per capita in 1000$")%>%
  hc_title(text ="GDP")%>%
  hc_add_theme(hc_theme_sandsignika())->hc1

lst1[[19]] <- hc1
  
htmltools::tagList(lst1)

6.

WDIData%>%filter(`Indicator Name`%in%index$index)%>%arrange(`Country Code`,`Indicator Name`)%>%
  select(code=`Country Code`,data=`2005`,`Indicator Name`)%>%group_by(code)%>%
  mutate(indicator=row_number(code))%>%select(code,indicator,data)->q6

codeList=unique(q6$code)
#264
q6%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
  q6%>%filter(code==codeList[i])%>%arrange(indicator)->x
  w1=t(as.table(x$data))
  w=rbind(w,w1)
  
  }

w=as.data.frame(w)
Q6_data=cbind(as.data.frame(codeList),w)
Q6_data[111,"N"]=Q6_data[258,"N"]
Q6_data%>%drop_na()->Q6_data
kcl = kmeans(Q6_data[,-1],centers = 3)
kcl
## K-means clustering with 3 clusters of sizes 7, 18, 27
## 
## Cluster means:
##          A         B        C         D        E        F         G
## 1 1.584334  2.749674 2.416700 1758.9597 8.931237 3.927011 3.4174204
## 2 1.798439  1.584434 2.530723 3797.0983 7.431177 6.133619 9.1851387
## 3 1.942006 10.277547 7.548247  262.3142 8.780996 4.663879 0.8680279
##          H        I         J         K        L        M         N
## 1 85.68544 12.25412 12.987823 15775.728 25.20177 2.469418  4.302026
## 2 74.90950 12.62080 21.136976 25308.283 27.49952 1.858166 10.636841
## 3 74.09277 14.48245  8.944539  3430.144 32.63414 5.958033  2.929275
##          O        P         Q        R
## 1 2.377591 97.59748  9.679224 8.275978
## 2 1.398093 72.27502  6.559527 6.186059
## 3 1.997948 95.49833 10.648896 8.977103
## 
## Clustering vector:
##   9  12  16  20  24  28  35  36  52  57  64  66  67  70  72  74  76  80 
##   3   2   2   3   3   3   3   2   1   2   1   3   2   3   1   2   2   2 
##  88  94 100 110 111 114 115 118 138 142 149 166 169 175 176 180 183 185 
##   1   2   3   2   3   1   2   2   3   3   3   3   2   2   2   2   3   3 
## 186 189 193 197 200 203 210 221 222 230 239 242 247 249 258 262 
##   3   3   1   2   3   3   3   1   2   3   3   3   3   3   3   3 
## 
## Within cluster sum of squares by cluster:
## [1]  54023434 423536041 133751826
##  (between_SS / total_SS =  89.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(Q6_data,col = kcl$cluster,pch = 20,cex = 2)

Q6=data.frame(name=Q6_data[,1],cluster=kcl$cluster)
ir_cls_num=Q6$cluster[which(as.character(Q6$name)=="IRN")]
Q6%>%filter(cluster==ir_cls_num)->IRN_clus
WDIData%>%filter(`Country Code`%in% IRN_clus$name)%>%select(`Country Name`)%>%unique()->IRN_clus_names

print(IRN_clus_names)
## # A tibble: 27 x 1
##    `Country Name`                              
##    <chr>                                       
##  1 Central Europe and the Baltics              
##  2 Europe & Central Asia (IDA & IBRD countries)
##  3 Lower middle income                         
##  4 South Asia                                  
##  5 South Asia (IDA & IBRD)                     
##  6 World                                       
##  7 Armenia                                     
##  8 Belarus                                     
##  9 Brazil                                      
## 10 Bulgaria                                    
## # ... with 17 more rows

7.

Most of countries are in one cluster, and only a small portion of them are in other 2. The clustring result is satisfying.

# Q7

library(ggbiplot)
Q7_pca = prcomp(Q6_data[,-1], scale. = TRUE)
ggbiplot(Q7_pca, obs.scale = 1, var.scale = 1,
         groups =as.character( kcl$cluster), ellipse = TRUE, circle = F) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

PCA = Q7_pca$sdev^2

qcc::pareto.chart(PCA)

##    
## Pareto chart analysis for PCA
##        Frequency    Cum.Freq.   Percentage Cum.Percent.
##   A   5.25846322   5.25846322  29.21368457  29.21368457
##   B   2.21799589   7.47645911  12.32219939  41.53588395
##   C   1.95400125   9.43046036  10.85556248  52.39144643
##   D   1.53212686  10.96258722   8.51181590  60.90326233
##   E   1.30165405  12.26424127   7.23141138  68.13467371
##   F   1.13687394  13.40111521   6.31596634  74.45064005
##   G   0.91562860  14.31674381   5.08682554  79.53746559
##   H   0.72901719  15.04576100   4.05009551  83.58756111
##   I   0.63998675  15.68574775   3.55548196  87.14304307
##   J   0.62145722  16.30720497   3.45254012  90.59558319
##   K   0.48582936  16.79303433   2.69905198  93.29463517
##   L   0.36092562  17.15395995   2.00514232  95.29977749
##   M   0.25461248  17.40857243   1.41451379  96.71429128
##   N   0.24191418  17.65048661   1.34396766  98.05825894
##   O   0.16993982  17.82042643   0.94411013  99.00236907
##   P   0.11401231  17.93443874   0.63340172  99.63577079
##   Q   0.05056338  17.98500212   0.28090767  99.91667845
##   R   0.01499788  18.00000000   0.08332155 100.00000000

9.

Nine comes sooner than eight here!

iran[,40:62]->ir_Q8
ir_Q8=as.data.frame(t(as.matrix(ir_Q8)))
ir_Q8$growth=iran_GDP[36:58,1]

library(h2o)
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         18 minutes 28 seconds 
##     H2O cluster version:        3.16.0.2 
##     H2O cluster version age:    7 months and 13 days !!! 
##     H2O cluster name:           H2O_started_from_R_Tara_tqw710 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.32 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.3 (2017-11-30)
happly = as.h2o(ir_Q8)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
hglm = h2o.glm(y = "growth", x= c("V1","V2","V3","V4","V5","V6",
                                  "V7","V8","V9","V10","V11","V12",
                                  "V13","V14","V15","V16","V17","V18"),
               training_frame = happly, family="gaussian")
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
hglm
## Model Details:
## ==============
## 
## H2ORegressionModel: glm
## Model ID:  GLM_model_R_1531547753200_2 
## GLM Model: summary
##     family     link                              regularization
## 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.1456 )
##   number_of_predictors_total number_of_active_predictors
## 1                         17                           3
##   number_of_iterations training_frame
## 1                    1          ir_Q8
## 
## Coefficients: glm coefficients
##        names coefficients standardized_coefficients
## 1  Intercept     2.072005                  5.551278
## 2         V1     0.000000                  0.000000
## 3         V2    -0.001817                 -0.003502
## 4         V3     0.000000                  0.000000
## 5         V4     0.000000                  0.000000
## 6         V5     0.000000                  0.000000
## 7         V6     0.000000                  0.000000
## 8         V7     0.000000                  0.000000
## 9         V8     0.000000                  0.000000
## 10        V9     0.000000                  0.000000
## 11       V10     0.000000                  0.000000
## 12       V11     0.001499                  0.662405
## 13       V12     0.000000                  0.000000
## 14       V13    -0.000192                 -0.001890
## 15       V14     0.000000                  0.000000
## 16       V15     0.000000                  0.000000
## 17       V17     0.000000                  0.000000
## 18       V18     0.000000                  0.000000
## 
## H2ORegressionMetrics: glm
## ** Reported on training data. **
## 
## MSE:  0.04870777
## RMSE:  0.2206984
## MAE:  0.1725157
## RMSLE:  0.03317761
## Mean Residual Deviance :  0.04870777
## R^2 :  0.9238002
## Null Deviance :14.06265
## Null D.o.F. :21
## Residual Deviance :1.071571
## Residual D.o.F. :18
## AIC :5.951126

8.

Most of countries are in one cluster, and only a small portion of them are in other 2.

# Q9
health=a[sort(c(57,139,51,479,332,646,775,1235,1251,1078,1323,1131)),]
edu=a[sort(c(572,574,759,1260,1356,1350,1328,1329,1259,1260,1288,1289,598,599)),]

Health related criteria are:

health
##  [1] Adolescent fertility rate (births per 1,000 women ages 15-19)
##  [2] Age dependency ratio (% of working-age population)           
##  [3] Birth rate, crude (per 1,000 people)                         
##  [4] Death rate, crude (per 1,000 people)                         
##  [5] Fertility rate, total (births per woman)                     
##  [6] Immunization, DPT (% of children ages 12-23 months)          
##  [7] Life expectancy at birth, total (years)                      
##  [8] Population ages 0-14 (% of total)                            
##  [9] Population growth (annual %)                                 
## [10] Prevalence of anemia among children (% of children under 5)  
## [11] Prevalence of undernourishment (% of population)             
## [12] Refugee population by country or territory of origin         
## 1591 Levels: 2005 PPP conversion factor, GDP (LCU per international $) ...

Education related criteria are:

edu
##  [1] Government expenditure on education, total (% of GDP)                                   
##  [2] Government expenditure per student, primary (% of GDP per capita)                       
##  [3] Gross intake ratio in first grade of primary education, female (% of relevant age group)
##  [4] Gross intake ratio in first grade of primary education, male (% of relevant age group)  
##  [5] Labor force, total                                                                      
##  [6] Primary completion rate, female (% of relevant age group)                               
##  [7] Primary completion rate, male (% of relevant age group)                                 
##  [8] Primary completion rate, male (% of relevant age group)                                 
##  [9] Progression to secondary school, female (%)                                             
## [10] Progression to secondary school, male (%)                                               
## [11] Repeaters, primary, female (% of female enrollment)                                     
## [12] Repeaters, primary, male (% of male enrollment)                                         
## [13] School enrollment, preprimary (% gross)                                                 
## [14] School enrollment, primary and secondary (gross), gender parity index (GPI)             
## 1591 Levels: 2005 PPP conversion factor, GDP (LCU per international $) ...
# health

### 5
as.data.frame(health)->health
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
  filter(`Indicator Name`%in%health$health)%>%arrange(`Indicator Name`)->iran

WDIData%>%filter(`Country Name`=="World")%>%
  filter(`Indicator Name`%in%health$health)%>%arrange(`Indicator Name`)->WLD


lst2 = list()

for (i in 1:12) {
  
  data.frame(value=as.numeric(iran[i,5:62]),year=1960:2017)->x
  data.frame(value=as.numeric(WLD[i,5:62]),year=1960:2017)->x1
  hchart(x,"line", hcaes(y = value, x = year),name= concatenate("IRR ",as.character(iran[i,3])))%>%
    hc_add_series(x1, "spline", hcaes(y = value, x = year), name = concatenate("WLD ",as.character(iran[i,3])))%>%
    hc_title(text =as.character(iran[i,3]))%>%
    hc_add_theme(hc_theme_sandsignika())->hc1
  lst2[[i]] <- hc1
}

htmltools::tagList(lst2)
### Q6


WDIData%>%filter(`Indicator Name`%in% health$health)%>%arrange(`Country Code`,`Indicator Name`)%>%
  select(code=`Country Code`,data=`2011`,`Indicator Name`)%>%group_by(code)%>%
  mutate(indicator=row_number(code))%>%select(code,indicator,data)->q6

codeList=unique(q6$code)
#264
q6%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
  q6%>%filter(code==codeList[i])%>%arrange(indicator)->x
  w1=t(as.table(x$data))
  w=rbind(w,w1)
  
}

w=as.data.frame(w)
Q6_data=cbind(as.data.frame(codeList),w)%>%na.omit()
kcl = kmeans(Q6_data[,-1],centers = 3)
kcl
## K-means clustering with 3 clusters of sizes 8, 186, 14
## 
## Cluster means:
##          A        B        C        D        E        F        G        H
## 1 83.48074 73.73819 30.28923 8.698515 3.954039 79.48215 64.14677 36.91023
## 2 50.83986 58.42074 21.28926 8.125110 2.759095 89.74555 70.98903 27.89793
## 3 64.82178 67.37208 27.69822 7.864780 3.584872 81.67857 66.15822 34.59174
##          I        J        K          L
## 1 2.084558 54.53991 18.32039 7825597.88
## 2 1.346483 34.67623 11.03765   87349.99
## 3 1.986412 48.40044 15.10230 3115756.00
## 
## Clustering vector:
##   2   3   4   6   7   8   9  11  12  13  14  16  17  18  19  20  22  23 
##   3   2   2   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  24  25  27  28  29  30  32  33  34  35  36  38  39  40  41  43  44  46 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  47  48  49  52  53  54  55  57  58  59  60  61  62  63  64  65  66  67 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  69  70  71  72  74  75  76  79  80  81  82  84  85  86  88  89  91  93 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  94  96  97  99 100 101 102 103 104 105 106 108 110 111 112 113 114 115 
##   2   2   1   2   2   3   1   1   2   2   1   2   2   2   2   2   2   2 
## 116 117 118 119 120 121 122 123 125 126 127 128 129 130 132 133 134 135 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   1   3 
## 137 138 139 140 141 142 144 147 149 150 151 152 153 155 156 157 158 159 
##   2   3   1   2   2   2   2   2   2   2   2   3   2   3   2   2   2   2 
## 160 161 162 164 165 166 167 168 169 170 172 173 174 175 176 177 179 180 
##   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 181 182 183 184 185 186 189 190 192 193 194 196 197 200 201 202 203 204 
##   2   2   2   2   2   2   2   1   2   2   2   2   2   2   2   2   3   2 
## 206 208 209 210 213 214 216 217 218 219 220 221 222 223 228 229 230 231 
##   2   2   2   2   2   3   3   2   2   2   2   2   2   2   2   2   2   2 
## 232 233 234 235 236 237 239 240 241 242 243 245 246 247 248 249 250 251 
##   2   2   2   2   2   2   3   3   2   2   2   2   2   2   3   2   2   2 
## 252 253 256 257 258 259 261 262 263 264 
##   2   2   2   2   1   2   2   2   2   2 
## 
## Within cluster sum of squares by cluster:
## [1] 2.504311e+13 1.245033e+13 1.335006e+13
##  (between_SS / total_SS =  91.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(Q6_data,col = kcl$cluster,pch = 20,cex = 2)

Q6=data.frame(name=Q6_data[,1],cluster=kcl$cluster)
ir_cls_num=Q6$cluster[which(as.character(Q6$name)=="IRN")]
Q6%>%filter(cluster==ir_cls_num)->IRN_clus
WDIData%>%filter(`Country Code`%in% IRN_clus$name)%>%select(`Country Name`)%>%unique()->IRN_clus_names

print(IRN_clus_names)
## # A tibble: 186 x 1
##    `Country Name`                               
##    <chr>                                        
##  1 Caribbean small states                       
##  2 Central Europe and the Baltics               
##  3 Early-demographic dividend                   
##  4 East Asia & Pacific                          
##  5 East Asia & Pacific (excluding high income)  
##  6 East Asia & Pacific (IDA & IBRD countries)   
##  7 Euro area                                    
##  8 Europe & Central Asia                        
##  9 Europe & Central Asia (excluding high income)
## 10 Europe & Central Asia (IDA & IBRD countries) 
## # ... with 176 more rows
### Q7
Q7_pca = prcomp(Q6_data[,-1], scale. = TRUE)
ggbiplot(Q7_pca, obs.scale = 1, var.scale = 1,
         groups =as.character( kcl$cluster), ellipse = TRUE, circle = F) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

PCA = Q7_pca$sdev^2
#names(PCA) = paste0('PC', eigv$PCs)
qcc::pareto.chart(PCA)

##    
## Pareto chart analysis for PCA
##        Frequency    Cum.Freq.   Percentage Cum.Percent.
##   A 7.688220e+00 7.688220e+00 6.406850e+01 6.406850e+01
##   B 1.421548e+00 9.109767e+00 1.184623e+01 7.591473e+01
##   C 9.770108e-01 1.008678e+01 8.141757e+00 8.405648e+01
##   D 5.668615e-01 1.065364e+01 4.723846e+00 8.878033e+01
##   E 4.602102e-01 1.111385e+01 3.835085e+00 9.261542e+01
##   F 3.189776e-01 1.143283e+01 2.658146e+00 9.527356e+01
##   G 2.401360e-01 1.167296e+01 2.001134e+00 9.727470e+01
##   H 1.854594e-01 1.185842e+01 1.545495e+00 9.882019e+01
##   I 9.714883e-02 1.195557e+01 8.095736e-01 9.962976e+01
##   J 3.388960e-02 1.198946e+01 2.824133e-01 9.991218e+01
##   K 6.965037e-03 1.199643e+01 5.804198e-02 9.997022e+01
##   L 3.573705e-03 1.200000e+01 2.978087e-02 1.000000e+02
#####################################################################
#####################################################################

# EDU

### 5
as.data.frame(edu)->edu
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
  filter(`Indicator Name`%in%edu$edu)%>%arrange(`Indicator Name`)->iran

WDIData%>%filter(`Country Name`=="World")%>%
  filter(`Indicator Name`%in%edu$edu)%>%arrange(`Indicator Name`)->WLD


lst3 = list()

for (i in 1:13) {
  
  data.frame(value=as.numeric(iran[i,5:62]),year=1960:2017)->x
  data.frame(value=as.numeric(WLD[i,5:62]),year=1960:2017)->x1
  hchart(x,"line", hcaes(y = value, x = year),name= concatenate("IRR ",as.character(iran[i,3])))%>%
    hc_add_series(x1, "spline", hcaes(y = value, x = year), name = concatenate("WLD ",as.character(iran[i,3])))%>%
    hc_title(text =as.character(iran[i,3]))%>%
    hc_add_theme(hc_theme_sandsignika())->hc1
  lst3[[i]] <- hc1
}

htmltools::tagList(lst3)
### Q6


WDIData%>%filter(`Indicator Name`%in% edu$edu)%>%arrange(`Country Code`,`Indicator Name`)%>%
  select(code=`Country Code`,data=`2011`,`Indicator Name`)%>%group_by(code)%>%
  mutate(indicator=row_number(code))%>%select(code,indicator,data)->q6

codeList=unique(q6$code)
#264
q6%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
  q6%>%filter(code==codeList[i])%>%arrange(indicator)->x
  w1=t(as.table(x$data))
  w=rbind(w,w1)
  
}

w=as.data.frame(w)
Q6_data=cbind(as.data.frame(codeList),w)%>%na.omit()
kcl = kmeans(Q6_data[,-1],centers = 3)
kcl
## K-means clustering with 3 clusters of sizes 59, 4, 12
## 
## Cluster means:
##          A        B        C        D          E        F        G
## 1 4.893564 17.48515 103.0388 105.5061   41595917 90.22627 91.36756
## 2 4.192131 13.70823 107.4102 109.5405 2653954658 90.19385 91.90959
## 3 3.963129 12.56628 108.3180 113.3255  591662012 80.24198 83.64485
##          H        I        J        K        L         M
## 1 93.20675 93.81361 5.064084 5.359394 65.86780 0.9761970
## 2 91.83340 92.02502 4.847448 4.359955 37.84992 0.9751325
## 3 86.53433 87.01977 6.259462 5.736092 36.98487 0.9469842
## 
## Clustering vector:
##   8  13  15  20  27  31  35  36  38  41  44  46  47  48  52  53  57  64 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   3 
##  65  67  70  71  72  74  82  91  97 101 102 103 106 111 113 114 115 127 
##   1   1   1   1   1   1   1   1   1   2   2   3   3   1   1   1   1   1 
## 128 133 134 135 137 139 141 142 144 149 153 158 166 172 180 182 184 185 
##   1   1   3   1   1   2   3   1   1   1   1   1   1   1   3   1   1   1 
## 190 194 200 203 206 209 213 214 216 217 220 221 222 223 228 235 239 240 
##   1   1   1   3   1   1   1   3   3   1   1   1   1   1   1   1   3   3 
## 247 248 258 
##   1   3   2 
## 
## Within cluster sum of squares by cluster:
## [1] 4.249520e+17 6.115389e+17 1.210090e+18
##  (between_SS / total_SS =  92.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(Q6_data,col = kcl$cluster,pch = 20,cex = 2)

Q6=data.frame(name=Q6_data[,1],cluster=kcl$cluster)
ir_cls_num=Q6$cluster[which(as.character(Q6$name)=="IRN")]
Q6%>%filter(cluster==ir_cls_num)->IRN_clus
WDIData%>%filter(`Country Code`%in% IRN_clus$name)%>%select(`Country Name`)%>%unique()->IRN_clus_names

print(IRN_clus_names)
## # A tibble: 59 x 1
##    `Country Name`                                      
##    <chr>                                               
##  1 Caribbean small states                              
##  2 Central Europe and the Baltics                      
##  3 Euro area                                           
##  4 European Union                                      
##  5 Heavily indebted poor countries (HIPC)              
##  6 Latin America & Caribbean                           
##  7 Latin America & Caribbean (excluding high income)   
##  8 Latin America & the Caribbean (IDA & IBRD countries)
##  9 Low income                                          
## 10 Other small states                                  
## # ... with 49 more rows
### Q7
Q7_pca = prcomp(Q6_data[,-1], scale. = TRUE)
ggbiplot(Q7_pca, obs.scale = 1, var.scale = 1,
         groups =as.character( kcl$cluster), ellipse = TRUE, circle = F) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

PCA = Q7_pca$sdev^2
#names(PCA) = paste0('PC', eigv$PCs)
qcc::pareto.chart(PCA)

##    
## Pareto chart analysis for PCA
##        Frequency    Cum.Freq.   Percentage Cum.Percent.
##   A 7.370078e+00 7.370078e+00 5.669291e+01 5.669291e+01
##   B 1.410093e+00 8.780171e+00 1.084687e+01 6.753978e+01
##   C 1.173857e+00 9.954028e+00 9.029672e+00 7.656945e+01
##   D 9.567394e-01 1.091077e+01 7.359534e+00 8.392898e+01
##   E 6.604114e-01 1.157118e+01 5.080088e+00 8.900907e+01
##   F 4.969341e-01 1.206811e+01 3.822570e+00 9.283164e+01
##   G 3.541847e-01 1.242230e+01 2.724498e+00 9.555614e+01
##   H 3.114850e-01 1.273378e+01 2.396038e+00 9.795217e+01
##   I 1.629921e-01 1.289677e+01 1.253785e+00 9.920596e+01
##   J 4.951025e-02 1.294629e+01 3.808481e-01 9.958681e+01
##   K 3.495632e-02 1.298124e+01 2.688948e-01 9.985570e+01
##   L 1.497222e-02 1.299621e+01 1.151709e-01 9.997087e+01
##   M 3.786438e-03 1.300000e+01 2.912644e-02 1.000000e+02

10.

as.data.frame(c(as.character(health[,1]),as.character(edu[,1]),as.character(index[,1])))%>%unique()->all
names(all)="all"
WDIData%>%filter(`Indicator Name`%in% all$all)%>%arrange(`Country Code`,`Indicator Name`)%>%
  select(code=`Country Code`,data=`2011`,`Indicator Name`)%>%group_by(code)%>%
  mutate(indicator=row_number(code))%>%select(code,indicator,data)->q10

codeList=unique(q10$code)
#264
q10%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
  q10%>%filter(code==codeList[i])%>%arrange(indicator)->x
  w1=t(as.table(x$data))
  w=rbind(w,w1)
  
}

w=as.data.frame(w)
Q10_data=cbind(as.data.frame(codeList),w)
Q10_data=Q10_data[,-c(25,28)]
Q10_data=Q10_data%>%na.omit()
row.names(Q10_data)=Q10_data$codeList
Q10_data=Q10_data[,-1]
dist = stats::dist(Q10_data,method = "euclidean")
clus = hclust(dist,method = "complete")
plot(clus)
rect.hclust(clus, 3)

11.

a.

Prevalence of anemia among children has constantly decresed in iran. it was as high as 50% just 30 years ago, and is currently less than 30%. Switzerland is one of the ,ost developed and wealthiest countries. Its rate is substantially lower. However, since 2010 the Prevalence of anemia has increased by 3% which is realy odd for such a country!

#"Prevalence of anemia among children"

WDIData%>%filter(`Country Name`=="Iran, Islamic Rep."  )%>%
  filter(`Indicator Name`==a[1235,])->q11_1_ir
q11_1_ir=data.frame(data=as.numeric(q11_1_ir[,5:62]),year=1960:2017)
WDIData%>%filter(`Country Name`=="Switzerland" )%>%
  filter(`Indicator Name`==a[1235,])->q11_1_ch
q11_1_ch=data.frame(data=as.numeric(q11_1_ch[,5:62]),year=1960:2017)
WDIData%>%filter( `Country Name`=="World" )%>%
  filter(`Indicator Name`==a[1235,])->q11_1_wld
q11_1_wld=data.frame(data=as.numeric(q11_1_wld[,5:62]),year=1960:2017)

hchart(q11_1_ir,"line", hcaes(y = data, x = year),name="IRR")%>%
  hc_add_series(q11_1_wld, "spline", hcaes(y = data, x = year), name ="WLD")%>%
  hc_add_series(q11_1_ch, "spline", hcaes(y = data, x = year), name ="CHF")%>%
  hc_title(text =a[1235,])%>%
  hc_add_theme(hc_theme_sandsignika())

b.

The immunization against measles rate for Iran is one of the highest. Sunprisingly, the immunization rate for US and switzerland are lower than Iran! I googled it and found that it is not a compulsory vaccination in those two country. In addition to that, there are some debates whether measles vaccine is related to autism in the next generation.

“Measles vaccination is an important indicator of hesitancy and refusal, since misinformation accusing it of causing autism, despite evidence of its fraudulent source, is still going on.”

If you google the autism rate, it is 1.9% for Iran and 1.7% for US. Due to the fact that there are probably many austism cases in Iran that have not been counted, I wonder whether such speculation about measles vaccine can be proved true in future!

WDIData%>%filter(`Country Name`=="Iran, Islamic Rep."  )%>%
  filter(`Indicator Name`==a[648,])->q11_2_ir
q11_2_ir=data.frame(data=as.numeric(q11_2_ir[,5:62]),year=1960:2017)
WDIData%>%filter(`Country Name`=="Switzerland" )%>%
  filter(`Indicator Name`==a[648,])->q11_2_ch
q11_2_ch=data.frame(data=as.numeric(q11_2_ch[,5:62]),year=1960:2017)
WDIData%>%filter( `Country Name`=="United States" )%>%
  filter(`Indicator Name`==a[648,])->q11_2_US
q11_2_US=data.frame(data=as.numeric(q11_2_US[,5:62]),year=1960:2017)

hchart(q11_2_ir,"line", hcaes(y = data, x = year),name="IRR")%>%
  hc_add_series(q11_2_US, "spline", hcaes(y = data, x = year), name ="US")%>%
  hc_add_series(q11_2_ch, "spline", hcaes(y = data, x = year), name ="CHF")%>%
  hc_title(text =a[648,])%>%
  hc_add_theme(hc_theme_sandsignika())

c.

Ratio of girls to boys is “gender parity index”. Iran’s index has the highest growth and is now above 1. US’s index has been constantly above 1, but is now a little lower than Iran, I believe such a large growth is a sign of modernization of society in a developing country.

WDIData%>%filter(`Country Name`=="Iran, Islamic Rep."  )%>%
  filter(`Indicator Name`==a[1356,])->q11_3_ir
q11_3_ir=data.frame(data=as.numeric(q11_3_ir[,5:62]),year=1960:2017)
WDIData%>%filter(`Country Name`=="Switzerland" )%>%
  filter(`Indicator Name`==a[1356,])->q11_3_ch
q11_3_ch=data.frame(data=as.numeric(q11_3_ch[,5:62]),year=1960:2017)
WDIData%>%filter( `Country Name`=="United States" )%>%
  filter(`Indicator Name`==a[1356,])->q11_3_US
q11_3_US=data.frame(data=as.numeric(q11_3_US[,5:62]),year=1960:2017)

hchart(q11_3_ir,"line", hcaes(y = data, x = year),name="IRR")%>%
  hc_add_series(q11_3_US, "spline", hcaes(y = data, x = year), name ="US")%>%
  hc_add_series(q11_3_ch, "spline", hcaes(y = data, x = year), name ="CHF")%>%
  hc_title(text =a[1356,])%>%
  hc_add_theme(hc_theme_sandsignika())